In [1]:
!pip install catboost plotly
Requirement already satisfied: catboost in c:\users\alina\anaconda3\lib\site-packages (0.20.1)
Requirement already satisfied: plotly in c:\users\alina\anaconda3\lib\site-packages (4.1.1)
Requirement already satisfied: pandas>=0.24.0 in c:\users\alina\anaconda3\lib\site-packages (from catboost) (0.25.3)
Requirement already satisfied: six in c:\users\alina\anaconda3\lib\site-packages (from catboost) (1.13.0)
Requirement already satisfied: numpy>=1.16.0 in c:\users\alina\anaconda3\lib\site-packages (from catboost) (1.17.3)
Requirement already satisfied: graphviz in c:\users\alina\anaconda3\lib\site-packages (from catboost) (0.13.2)
Requirement already satisfied: scipy in c:\users\alina\anaconda3\lib\site-packages (from catboost) (1.4.1)
Requirement already satisfied: matplotlib in c:\users\alina\anaconda3\lib\site-packages (from catboost) (3.3.2)
Requirement already satisfied: retrying>=1.3.3 in c:\users\alina\anaconda3\lib\site-packages (from plotly) (1.3.3)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\users\alina\anaconda3\lib\site-packages (from pandas>=0.24.0->catboost) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in c:\users\alina\anaconda3\lib\site-packages (from pandas>=0.24.0->catboost) (2019.3)
Requirement already satisfied: certifi>=2020.06.20 in c:\users\alina\anaconda3\lib\site-packages (from matplotlib->catboost) (2020.6.20)
Requirement already satisfied: cycler>=0.10 in c:\users\alina\anaconda3\lib\site-packages (from matplotlib->catboost) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\users\alina\anaconda3\lib\site-packages (from matplotlib->catboost) (2.4.5)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\alina\anaconda3\lib\site-packages (from matplotlib->catboost) (1.1.0)
Requirement already satisfied: pillow>=6.2.0 in c:\users\alina\anaconda3\lib\site-packages (from matplotlib->catboost) (6.2.1)
Requirement already satisfied: setuptools in c:\users\alina\anaconda3\lib\site-packages (from kiwisolver>=1.0.1->matplotlib->catboost) (42.0.1.post20191125)

Обработка лидарных данных

Сегментация

Про лидар

А что за данные на самом деле

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import tqdm
py.init_notebook_mode(connected=True)

EQUAL_ASPECT_RATIO_LAYOUT = dict(
    margin={
        'l': 0,
        'r': 0,
        'b': 0,
        't': 0
    }, scene=dict(
    aspectmode='data'
))


def color(x, cmap='Reds'):
    cmap = plt.get_cmap(cmap)
    x = (x - np.min(x)) / np.max(x)
    
    return cmap(x)

%matplotlib inline
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The savefig.jpeg_quality rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The keymap.all_axes rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The animation.avconv_path rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\Alina\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The animation.avconv_args rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In [3]:
ds = pd.read_csv('./snow.csv')
ds = ds.set_index(['scene_id'])
ds.head()
Out[3]:
x y z intensity ring label
scene_id
0 -11.355618 -4.206962 0.344085 0.0 23.0 1.0
0 -5.916535 -1.972164 0.283262 0.0 25.0 1.0
0 -7.410451 -2.113039 2.137792 0.0 31.0 1.0
0 -13.845870 -1.406652 0.406310 0.0 23.0 1.0
0 -8.326218 -0.346060 0.226469 0.0 22.0 1.0
  • intensity - ???
  • ring - ???

Кольцо

In [4]:
scene = ds.loc[0]

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(scene.ring, 'tab20'),
    },
    'text': scene.ring
})

py.iplot(fig)

Интенсивность

In [5]:
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(scene.intensity, 'seismic'),
    },
    'text': scene.intensity
})

py.iplot(fig)
In [6]:
scene = ds.loc[1]

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(scene.intensity, 'seismic'),
    },
    'text': scene.ring
})

py.iplot(fig)

Отфильтруем снег

Эвристикой

In [7]:
def filter_by_intensity(intensity, limit=2):
    return intensity >= limit

filtered_scene = scene[filter_by_intensity(scene.intensity)]


fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': filtered_scene.x,
    'y': filtered_scene.y,
    'z': filtered_scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(filtered_scene.intensity, 'seismic'),
    },
    'text': scene.ring
})

py.iplot(fig)

Плохо и непонятно, будем учить

Облачные вычисления

In [8]:
from sklearn.neighbors import KDTree

class ComputeFeatures(object):
    def __init__(self, r=1.0):
        self.xyz = None
        self.intensity = None
        self.ring = None
        self.index = None
        self.r = r

    def _feature_names(self):
        return ['number_of_neighbors', 'mean_intensity', 'max_intensity', 'min_intensity', 'std_intensity',
                'median_ring', 'max_ring', 'min_ring', 'std_ring']

        
    def compute_point_features(self, point_id, neighbours):
        number_of_neighbors = len(neighbours)
        
        mean_intensity, max_intensity, min_intensity, std_intensity = np.mean(self.intensity[neighbours]), \
        np.max(self.intensity[neighbours]), np.min(self.intensity[neighbours]), np.std(self.intensity[neighbours])
        
        median_ring, max_ring, min_ring, std_ring = np.median(self.ring[neighbours]), \
        np.max(self.ring[neighbours]), np.min(self.ring[neighbours]), np.std(self.ring[neighbours])
        return number_of_neighbors, mean_intensity, max_intensity, min_intensity, std_intensity, median_ring,\
                max_ring, min_ring, std_ring
    
    
    def get_point_neighbours(self, point_id):
        return self.index.query_radius(self.xyz[point_id][np.newaxis, :], r=self.r)[0]
        
    def __call__(self, xyz, intensity, ring):
        self.xyz = xyz[:]
        self.intensity = intensity[:]
        self.ring = ring[:]
        
        self.index = KDTree(self.xyz)
        
        features = []
        for point_id in range(len(self.xyz)):
            neighbours = self.get_point_neighbours(point_id)
            features.append(self.compute_point_features(point_id, neighbours))
        
        
        return pd.DataFrame(columns=self._feature_names(), data=features)
    
In [9]:
# ds_features = pd.read_csv('./snow_features.csv')
# ds_features = ds_features.drop(["Unnamed: 0"], axis=1)
# ds_features.shape

Посчитаем фичи

In [10]:
# features = ComputeFeatures(r=1.0)

# for scene_id in tqdm.tqdm(ds.reset_index().scene_id.unique()):
#     scene = ds.loc[scene_id]
#     features_df = \
#         features(scene[['x', 'y', 'z']].values, scene.intensity.values, scene.ring.values)
#     features_df.to_csv('./features/{}.csv'.format(scene_id))

Посмотрим на разметку

In [11]:
scene = ds.loc[1]

fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 1,
        'color': color(scene.label, 'seismic'),
    },
    'text': scene.label
})

py.iplot(fig)

Поучим что-нибудь

In [12]:
all_features = []

for scene_id in tqdm.tqdm(ds.reset_index().scene_id.unique()):
    features = pd.read_csv('./features/{}.csv'.format(scene_id), index_col=None)
    features.drop(['Unnamed: 0'], axis=1, inplace=True)
    all_features.append(features)
    
all_features = pd.concat(all_features, ignore_index=True)
100%|████████████████████████████████████████████████████████████████████████████████| 291/291 [00:28<00:00, 10.28it/s]
In [13]:
all_features = pd.concat([ds.reset_index(), all_features], axis=1)
all_features
Out[13]:
scene_id x y z intensity ring label number_of_neighbors mean_intensity max_intensity min_intensity std_intensity median_ring max_ring min_ring std_ring
0 0 -11.355618 -4.206962 0.344085 0.0 23.0 1.0 1 0.000000 0.0 0.0 0.000000 23.0 23.0 23.0 0.000000
1 0 -5.916535 -1.972164 0.283262 0.0 25.0 1.0 1 0.000000 0.0 0.0 0.000000 25.0 25.0 25.0 0.000000
2 0 -7.410451 -2.113039 2.137792 0.0 31.0 1.0 1 0.000000 0.0 0.0 0.000000 31.0 31.0 31.0 0.000000
3 0 -13.845870 -1.406652 0.406310 0.0 23.0 1.0 1 0.000000 0.0 0.0 0.000000 23.0 23.0 23.0 0.000000
4 0 -8.326218 -0.346060 0.226469 0.0 22.0 1.0 1 0.000000 0.0 0.0 0.000000 22.0 22.0 22.0 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7001786 290 1.391711 27.237587 -1.599782 0.0 21.0 0.0 9 0.666667 2.0 0.0 0.816497 19.0 21.0 19.0 0.993808
7001787 290 -0.498851 23.009487 -1.622276 0.0 23.0 0.0 4 0.250000 1.0 0.0 0.433013 23.0 23.0 23.0 0.000000
7001788 290 1.079074 21.118877 -1.614878 0.0 24.0 0.0 17 0.058824 1.0 0.0 0.235294 22.0 24.0 22.0 0.998268
7001789 290 -1.918804 20.268629 -1.674551 0.0 25.0 0.0 7 0.000000 0.0 0.0 0.000000 25.0 25.0 25.0 0.000000
7001790 290 -0.264756 12.211845 -1.725581 0.0 31.0 0.0 26 0.000000 0.0 0.0 0.000000 31.0 31.0 30.0 0.486504

7001791 rows × 16 columns

In [14]:
from sklearn.model_selection import train_test_split
In [15]:
train, test = train_test_split(all_features, test_size = 0.2, shuffle = True)
val, test = train_test_split(test, test_size = 0.5, shuffle = True)
In [16]:
import catboost

def learn(X_train, X_val, y_train, y_val):
    clf = catboost.CatBoostClassifier(n_estimators=100)
    clf.fit(
        X_train, y_train, early_stopping_rounds=10,
        use_best_model=True, eval_set=(X_val.values, y_val.values), plot=True, verbose=False)
    return clf

X_train = train.drop(["scene_id", "label", "x", "y", "z"], axis=1)
y_train = train.label


X_val = val.drop(["scene_id", "label", "x", "y", "z"], axis=1)
y_val = val.label
In [17]:
del ds
cls = learn(X_train, X_val, y_train, y_val)
In [18]:
X_test = test.drop(['scene_id', 'x', 'y', 'z', 'label'], axis=1)
y_test = test.label

from sklearn.metrics import precision_recall_curve, precision_score, recall_score, auc

def test_one(clf, X_test, y_test):
    y_test_hat = clf.predict_proba(X_test)
    pr, rec, thr = precision_recall_curve(y_test, y_test_hat[:, 1])
    ix = np.linspace(1, len(pr)-1, num=2000).astype(int)
    return pr[ix], rec[ix], thr[ix - 1]


def heuristic_filter_scoring():
    pr = []
    rec = []
    filter_range = range(1, 10)
    for i in filter_range:
        y_test_heuristic_hat = np.ones(len(X_test))
        y_test_heuristic_hat[filter_by_intensity(test.intensity, i)] = 0
        pr.append(precision_score(y_test, y_test_heuristic_hat))
        rec.append(recall_score(y_test, y_test_heuristic_hat))
        
    return pr, rec, filter_range

pr_bl, rec_bl, thr_bl = heuristic_filter_scoring()

def plot_pr_rec(*models):
    traces = []
    for model, clf, X_test, y_test in models:
        pr, rec, thr = test_one(clf, X_test, y_test)
        pr_rec = go.Scattergl(x = rec, y = pr, mode='lines', text=thr, name=f'{model}')
        traces.append(pr_rec)
        print(f"AUC for catboost classifier {model}: ", auc(rec, pr))

    pr_rec_bl = go.Scatter(x = rec_bl, y = pr_bl, mode='lines+markers', text=list(map(str, thr_bl)), name='Intensity BL')
    
    layout = go.Layout(
        title='Precission-recall',
        xaxis=dict(
            title='Recall'
        ),
        yaxis=dict(
            title='Precission'
        ))
    fig = go.Figure(
        data=traces + [pr_rec_bl],
        layout=layout)
    py.iplot(fig)
    
models = [('Catboost classifier', cls, X_test, y_test)]
plot_pr_rec(*models)
AUC for catboost classifier Catboost classifier:  0.9961879261384519

Повизуализируем

In [19]:
y_test_hat = cls.predict_proba(test.drop(['scene_id', 'x', 'y', 'z', 'label'], axis=1))
In [20]:
scene_id = 10
scene = test.set_index(['scene_id']).loc[scene_id]
scene_predictions = y_test_hat[test.scene_id == scene_id][:, 1]
In [21]:
fig = go.Figure(layout=EQUAL_ASPECT_RATIO_LAYOUT)
fig.add_scatter3d(**{
    'x': scene.x,
    'y': scene.y,
    'z': scene.z,
    'mode': 'markers',
    'marker': {
        'size': 2,
        'color': color((np.round(scene_predictions)==scene.label).astype(int), 'tab20b'),
    },
    'text': [f"true: {true}, predicted: {round(pred)}" for pred,
             true in zip(scene_predictions, scene.label)]
})

py.iplot(fig)